{
 "cells": [
  {
   "cell_type": "code",
   "id": "initial_id",
   "metadata": {
    "collapsed": true,
    "ExecuteTime": {
     "end_time": "2025-02-16T12:54:17.971277Z",
     "start_time": "2025-02-16T12:54:16.519736Z"
    }
   },
   "source": [
    "import os\n",
    "\n",
    "from sklearn.datasets import fetch_california_housing\n",
    "from sklearn.linear_model import LinearRegression, SGDRegressor, Ridge, LogisticRegression, Lasso\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "from sklearn.metrics import mean_squared_error, classification_report, roc_auc_score\n",
    "import joblib\n",
    "import pandas as pd\n",
    "import numpy as np"
   ],
   "outputs": [],
   "execution_count": 1
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-02-16T12:54:19.340838Z",
     "start_time": "2025-02-16T12:54:19.329082Z"
    }
   },
   "cell_type": "code",
   "source": [
    "\"\"\"\n",
    "线性回归直接预测房子价格\n",
    ":return: None\n",
    "\"\"\"\n",
    "# 获取数据\n",
    "fe_cal = fetch_california_housing(data_home='../data')\n",
    "\n",
    "print(\"获取特征值\")\n",
    "print(fe_cal.data.shape)\n",
    "print('-' * 50)\n",
    "print(fe_cal.data[0])\n",
    "print(\"目标值\")\n",
    "print(fe_cal.target) #单位是10万美金\n",
    "print(fe_cal.DESCR)\n",
    "print('-' * 50)\n",
    "print(fe_cal.feature_names) #特征列的名字"
   ],
   "id": "190edc03e3bfde95",
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "获取特征值\n",
      "(20640, 8)\n",
      "--------------------------------------------------\n",
      "[   8.3252       41.            6.98412698    1.02380952  322.\n",
      "    2.55555556   37.88       -122.23      ]\n",
      "目标值\n",
      "[4.526 3.585 3.521 ... 0.923 0.847 0.894]\n",
      ".. _california_housing_dataset:\n",
      "\n",
      "California Housing dataset\n",
      "--------------------------\n",
      "\n",
      "**Data Set Characteristics:**\n",
      "\n",
      ":Number of Instances: 20640\n",
      "\n",
      ":Number of Attributes: 8 numeric, predictive attributes and the target\n",
      "\n",
      ":Attribute Information:\n",
      "    - MedInc        median income in block group\n",
      "    - HouseAge      median house age in block group\n",
      "    - AveRooms      average number of rooms per household\n",
      "    - AveBedrms     average number of bedrooms per household\n",
      "    - Population    block group population\n",
      "    - AveOccup      average number of household members\n",
      "    - Latitude      block group latitude\n",
      "    - Longitude     block group longitude\n",
      "\n",
      ":Missing Attribute Values: None\n",
      "\n",
      "This dataset was obtained from the StatLib repository.\n",
      "https://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.html\n",
      "\n",
      "The target variable is the median house value for California districts,\n",
      "expressed in hundreds of thousands of dollars ($100,000).\n",
      "\n",
      "This dataset was derived from the 1990 U.S. census, using one row per census\n",
      "block group. A block group is the smallest geographical unit for which the U.S.\n",
      "Census Bureau publishes sample data (a block group typically has a population\n",
      "of 600 to 3,000 people).\n",
      "\n",
      "A household is a group of people residing within a home. Since the average\n",
      "number of rooms and bedrooms in this dataset are provided per household, these\n",
      "columns may take surprisingly large values for block groups with few households\n",
      "and many empty houses, such as vacation resorts.\n",
      "\n",
      "It can be downloaded/loaded using the\n",
      ":func:`sklearn.datasets.fetch_california_housing` function.\n",
      "\n",
      ".. rubric:: References\n",
      "\n",
      "- Pace, R. Kelley and Ronald Barry, Sparse Spatial Autoregressions,\n",
      "  Statistics and Probability Letters, 33 (1997) 291-297\n",
      "\n",
      "--------------------------------------------------\n",
      "['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude']\n"
     ]
    }
   ],
   "execution_count": 2
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-02-16T12:54:30.637623Z",
     "start_time": "2025-02-16T12:54:30.628793Z"
    }
   },
   "cell_type": "code",
   "source": [
    "# 分割数据集到训练集和测试集\n",
    "x_train, x_test, y_train, y_test = train_test_split(fe_cal.data, fe_cal.target, test_size=0.25, random_state=1)\n",
    "#\n",
    "print(x_train.shape)\n",
    "#\n",
    "# # 进行标准化处理(?) 目标值处理？\n",
    "# # 特征值和目标值是都必须进行标准化处理, 实例化两个标准化API\n",
    "std_x = StandardScaler()\n",
    "#\n",
    "x_train = std_x.fit_transform(x_train) #训练集标准化\n",
    "x_test = std_x.transform(x_test) #测试集标准化\n",
    "\n",
    "# 目标值进行了标准化，暂时没有对目标值进行标准化处理\n",
    "# std_y = StandardScaler()\n",
    "#\n",
    "# temp = y_train.reshape(-1, 1) #-1代表把剩余的元素都堆到哪一维\n",
    "#\n",
    "# #标签进行标准化\n",
    "# # 目标值是一维的，这里需要传进去2维的\n",
    "# y_train = std_y.fit_transform(y_train.reshape(-1, 1))\n",
    "# print(y_train.shape)\n",
    "# y_test = std_y.transform(y_test.reshape(-1, 1))\n",
    "# print(y_test.shape)"
   ],
   "id": "6697e07c39778d2",
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(15480, 8)\n"
     ]
    }
   ],
   "execution_count": 3
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-02-16T12:54:48.552289Z",
     "start_time": "2025-02-16T12:54:48.541942Z"
    }
   },
   "cell_type": "code",
   "source": [
    "import os\n",
    "# # estimator预测\n",
    "# # # 正规方程求解方式预测结果，正规方程进行线性回归\n",
    "lr = LinearRegression()\n",
    "# fit是耗时的\n",
    "lr.fit(x_train, y_train)\n",
    "#回归系数可以看特征与目标之间的相关性\n",
    "print('回归系数', lr.coef_)\n",
    "#\n",
    "y_predict = lr.predict(x_test)\n",
    "# 预测测试集的房子价格，通过inverse得到真正的房子价格\n",
    "#y_lr_predict = std_y.inverse_transform(y_predict)\n",
    "# 保存训练好的模型，模型中保存的是w的值，也保存了模型结构\n",
    "#保存模型放在fit之后即可\n",
    "os.unlink('../tmp/test.pkl') # 删除之前的模型文件\n",
    "joblib.dump(lr, \"../tmp/test.pkl\")\n",
    "print(\"正规方程测试集里面每个房子的预测价格：\", y_predict[0:10])\n",
    "#下面是求测试集的损失，用均方误差，公式是(y_test-y_predict)^2/n\n",
    "print(\"正规方程的均方误差：\", mean_squared_error(y_test, y_predict))"
   ],
   "id": "d20e932742e1b8a4",
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "回归系数 [ 0.83167028  0.12159502 -0.26758589  0.30983997 -0.00518054 -0.04040421\n",
      " -0.90736902 -0.88212727]\n",
      "正规方程测试集里面每个房子的预测价格： [2.12391852 0.93825754 2.7088455  1.70873764 2.82954754 3.50376456\n",
      " 3.0147162  1.62781292 1.74317518 2.01897806]\n",
      "正规方程的均方误差： 0.5356532845422556\n"
     ]
    }
   ],
   "execution_count": 5
  },
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": "# 2 加载保存的模型",
   "id": "a020deab0a711cbf"
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-02-16T12:55:17.520959Z",
     "start_time": "2025-02-16T12:55:17.509059Z"
    }
   },
   "cell_type": "code",
   "source": [
    "#模拟上线时加载模型\n",
    "model = joblib.load(\"../tmp/test.pkl\")\n",
    "# # 因为目标值进行了标准化，一定要把预测后的值逆向转换回来\n",
    "y_predict = model.predict(x_test)\n",
    "\n",
    "#\n",
    "print(\"保存的模型预测的结果：\", y_predict)\n",
    "print(\"正规方程的均方误差：\", mean_squared_error(y_test, y_predict))\n",
    "\n",
    "# print(\"正规方程inverse后的均方误差：\", mean_squared_error(std_y.inverse_transform(y_test),\n",
    "#                                                std_y.inverse_transform(y_predict)))"
   ],
   "id": "4776dc471e034f13",
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "保存的模型预测的结果： [2.12391852 0.93825754 2.7088455  ... 1.24263061 2.73771901 1.75800594]\n",
      "正规方程的均方误差： 0.5356532845422556\n"
     ]
    }
   ],
   "execution_count": 7
  },
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": "# 3 线性回归之梯度下降去进行房价预测",
   "id": "b12e5d1ec331d516"
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-02-16T12:55:44.585243Z",
     "start_time": "2025-02-16T12:55:44.560788Z"
    }
   },
   "cell_type": "code",
   "source": [
    "# 梯度下降去进行房价预测,数据量大要用这个\n",
    "# learning_rate的不同方式，代表学习率变化的算法不一样,比如constant,invscaling,adaptive\n",
    "# 默认可以去调 eta0 = 0.008，会改变learning_rate的初始值\n",
    "# learning_rate='optimal',alpha是正则化力度，但是会影响学习率的值，由alpha来算学习率\n",
    "# penalty代表正则化，分为l1和l2\n",
    "# eta0=0.01, penalty='l2',max_iter=1000\n",
    "sgd = SGDRegressor(eta0=0.01,penalty='l2', max_iter=1000)\n",
    "# # 训练\n",
    "sgd.fit(x_train, y_train)\n",
    "#\n",
    "print('梯度下降的回归系数', sgd.coef_)\n",
    "#\n",
    "# 预测测试集的房子价格\n",
    "# y_sgd_predict = std_y.inverse_transform(sgd.predict(x_test).reshape(-1, 1))\n",
    "y_predict = sgd.predict(x_test)\n",
    "# print(\"梯度下降测试集里面每个房子的预测价格：\", y_sgd_predict)\n",
    "print(\"梯度下降的均方误差：\", mean_squared_error(y_test, y_predict))\n",
    "# print(\"梯度下降的原始房价量纲均方误差：\", mean_squared_error(std_y.inverse_transform(y_test), y_sgd_predict))"
   ],
   "id": "e219fde968a1ca8f",
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "梯度下降的回归系数 [ 8.13240273e-01  1.32913159e-01 -1.84281267e-01  3.67433078e-01\n",
      " -7.27372775e-04 -3.88060101e-02 -9.12524095e-01 -8.88101417e-01]\n",
      "梯度下降的均方误差： 0.5449332540943888\n"
     ]
    }
   ],
   "execution_count": 8
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-02-17T02:41:23.413236Z",
     "start_time": "2025-02-17T02:41:23.408169Z"
    }
   },
   "cell_type": "code",
   "source": [
    "w=1\n",
    "alpha=0.1\n",
    "def loss(w):\n",
    "    return 2*w**2+3*w+2\n",
    "def dao_shu(w):\n",
    "    return 4*w+3\n",
    "for i in range(30):\n",
    "    w=w-alpha*dao_shu(w)\n",
    "    print(f'w {w} 损失{loss(w)}')"
   ],
   "id": "681e3b1f3145c6b1",
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "w 0.29999999999999993 损失3.0799999999999996\n",
      "w -0.12 损失1.6688\n",
      "w -0.372 损失1.160768\n",
      "w -0.5232 损失0.9778764800000002\n",
      "w -0.61392 损失0.9120355328\n",
      "w -0.6683520000000001 损失0.888332791808\n",
      "w -0.7010112000000001 损失0.87979980505088\n",
      "w -0.72060672 损失0.8767279298183168\n",
      "w -0.732364032 损失0.8756220547345941\n",
      "w -0.7394184192 损失0.8752239397044537\n",
      "w -0.74365105152 损失0.8750806182936035\n",
      "w -0.746190630912 损失0.8750290225856971\n",
      "w -0.7477143785472 损失0.8750104481308509\n",
      "w -0.74862862712832 损失0.8750037613271064\n",
      "w -0.749177176276992 损失0.8750013540777584\n",
      "w -0.7495063057661951 损失0.8750004874679931\n",
      "w -0.7497037834597171 损失0.8750001754884773\n",
      "w -0.7498222700758302 损失0.8750000631758519\n",
      "w -0.7498933620454982 损失0.8750000227433066\n",
      "w -0.7499360172272989 损失0.8750000081875902\n",
      "w -0.7499616103363793 损失0.8750000029475327\n",
      "w -0.7499769662018276 损失0.8750000010611116\n",
      "w -0.7499861797210966 损失0.8750000003820002\n",
      "w -0.749991707832658 损失0.87500000013752\n",
      "w -0.7499950246995948 损失0.8750000000495071\n",
      "w -0.7499970148197569 损失0.8750000000178226\n",
      "w -0.7499982088918541 损失0.8750000000064162\n",
      "w -0.7499989253351125 损失0.8750000000023095\n",
      "w -0.7499993552010675 损失0.8750000000008316\n",
      "w -0.7499996131206406 损失0.8750000000002991\n"
     ]
    }
   ],
   "execution_count": 10
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
